library(magrittr)
library(ggplot2)
library(gridExtra)
library(ggcorrplot)

source("../R/visualizations.R")
source("../R/feature_definitions.R")

training_set <- read.csv("../preprocessed_training_data.csv", row.names = 1, as.is = TRUE)
outcome <- read.csv("../training_outcomes.csv", row.names = 1)[,1]

stopifnot(row.names(training_set) == row.names(outcome))

features <- colnames(training_set)

Correlations

Plot correlation matrices of missingness indicators against missingness indicators, observed values against observed values, and missingness indicators against observed values.

positive_data <- training_set[outcome == "positive", ]
negative_data <- training_set[outcome == "negative", ]

# Missingness indicator correlations
plot_missingness_correlations(training_set, numeric_features, "Missingness indicator correlations")
## Warning in cor(miss_data[, features]): the standard deviation is zero

plot_missingness_correlations(positive_data, numeric_features, "Missingness indicator correlations (positive-labeled)")
## Warning in cor(miss_data[, features]): the standard deviation is zero

plot_missingness_correlations(negative_data, numeric_features, "Missingness indicator correlations (negative-labeled)")
## Warning in cor(miss_data[, features]): the standard deviation is zero

# Observed value correlations
plot_observed_correlations(training_set, numeric_features, "Correlations of observed values")
## Warning in cor(data[, features], use = "pairwise.complete.obs"): the
## standard deviation is zero

plot_observed_correlations(positive_data, numeric_features, "Correlations of observed values (positive-labeled)")
## Warning in cor(data[, features], use = "pairwise.complete.obs"): the
## standard deviation is zero

plot_observed_correlations(negative_data, numeric_features, "Correlations of observed values (negative-labeled)")
## Warning in cor(data[, features], use = "pairwise.complete.obs"): the
## standard deviation is zero

# Missingness vs. observed correlations
plot_missingness_vs_observed_correlations(training_set, numeric_features, "Missingness correlations vs. observed values")
## Warning in cor(data, miss_data, use = "pairwise.complete.obs"): the
## standard deviation is zero

plot_missingness_vs_observed_correlations(positive_data, numeric_features, "Missingness correlations vs. observed values (positive-labeled)")
## Warning in cor(data, miss_data, use = "pairwise.complete.obs"): the
## standard deviation is zero

plot_missingness_vs_observed_correlations(negative_data, numeric_features, "Missingness correlations vs. observed values (negative-labeled)")
## Warning in cor(data, miss_data, use = "pairwise.complete.obs"): the
## standard deviation is zero

Feature value distributions

Next, plot distributions of each feature. Are they normal or linear?

feature_distribution_plots <- lapply(numeric_features,
                                     function(column) {
                                       ggplot2::quickplot(
                                         na.omit(training_set[,column]),
                                         main = column,
                                         xlab = "",
                                         bins = 30
                                       )
                                     })

marrangeGrob(
  ncol = 2, nrow = 3,
  grobs = feature_distribution_plots
)

They are not, and thus it might be worth considering data transformations. In the case of random forest, however, monotone transformations should have no effect.

Categorical level occurence counts

Print (one-dimensional) contingency tables, i.e. occurence counts of each level of categorical variables.

for (cat_feat in categorical_features) {
  table(training_set[, cat_feat, drop = FALSE], dnn = cat_feat, useNA = "always") %>% as.data.frame %>% print
}
##   LRT_pred Freq
## 1        D  382
## 2        N  162
## 3        U    4
## 4     <NA> 2044
##   Dst2SplType Freq
## 1    ACCEPTOR  115
## 2       DONOR  139
## 3        <NA> 2338
##      Consequence.x Freq
## 1       3PRIME_UTR   22
## 2       5PRIME_UTR   17
## 3       DOWNSTREAM  458
## 4         INTRONIC  794
## 5   NON_SYNONYMOUS  580
## 6 NONCODING_CHANGE   13
## 7      SPLICE_SITE   73
## 8         UPSTREAM  635
## 9             <NA>    0

Heatmap of feature missingness against consequence

It is likely that missing values are more or less common in some variables depending on the predicted consequence. This can be visualized by a heatmap:

missing_value_sum_per_consequence <- lapply(training_set[, c(numeric_features, categorical_features), drop = FALSE],
                                            function(column) {
                                              sapply(
                                                split(is.na(column), training_set$Consequence.x),
                                                sum
                                              )
                                            })
missing_value_sum_per_consequence %<>% data.frame %>% as.matrix
heatmap(missing_value_sum_per_consequence)

Non-synonymous variants have much less missingness in certain variables and more in others (as expected).

Compute number of observed missingness patterns

missingness_patterns <- training_set[, c(numeric_features, categorical_features)] %>% is.na
unique_missingness_patterns <- missingness_patterns %>% unique
num_missingness_patterns <- unique_missingness_patterns %>% nrow
print(paste(num_missingness_patterns, "out of", 2^length(c(numeric_features, categorical_features)), "possible missingness patterns."))
## [1] "118 out of 72057594037927936 possible missingness patterns."
missingness_pattern_factor <- apply(missingness_patterns, MARGIN = 1, function(x) paste0(as.integer(x), collapse = "")) %>% factor
rows_per_missingness_pattern <- table(missingness_pattern_factor)
rows_per_missingness_pattern <- rows_per_missingness_pattern %>% as.data.frame
rows_per_missingness_pattern[order(rows_per_missingness_pattern$Freq, decreasing = TRUE),]